home *** CD-ROM | disk | FTP | other *** search
- _FINDING STRING DISTANCES_
- by Ray Valdes
-
- [LISTING ONE]
-
- /***********************************************************************
- LEVDIST.C -- Computing the Levenshtein distance (string-to-string edit)
- by Ray Valdes, DDJ April 92
- ***********************************************************************/
-
- #define TRUE 1
- #define FALSE 0
- #define private static
- #define public /**/
- typedef int bool;
-
- private bool verbose_mode = TRUE;
-
- typedef enum { MATCH, INS, DEL, SUB } opcode;
-
- typedef struct
- { int cost;
- char* name;
- int delta_row;
- int delta_col;
- } operation;
-
- #define COST(op) (optable[(int)op].cost) // for convenience
- #define OPSTR(op) (optable[(int)op].name) // for convenience
-
- private operation optable[] = //costs defined on a per-op basis
- {
- /*--cost, name, delta_row, delta_col---------------------------------*/
- { 0, "EQU", -1, -1}, /* a match or no-op backtracks to NorthWest */
- { 1, "INS", 0, -1}, /* insert op backtrack to the West */
- { 1, "DEL", -1, 0}, /* delete op backstracks to the North */
- { 1, "SUB", -1, -1}, /* substitution op backtracks to NorthWest */
- };
-
- typedef struct
- { int distance;
- opcode op;
- } matrix_cell;
-
- #define NUM_ROWS 64
- #define NUM_COLS NUM_ROWS
- #define SIZEOF_STRING NUM_ROWS
-
- private char A [SIZEOF_STRING];
- private char B [SIZEOF_STRING];
- private matrix_cell M [NUM_ROWS] [NUM_COLS]; // this is The Matrix
-
- #define DIST(i,j) (M [(i)][(j)].distance) // for convenience
- /****************************************************************/
-
- private void say_hello (void);
- private bool get_strings (void);
- private void initialize_matrix (void);
- private void calculate_cell (int row,int col);
- private void print_cell (int row,int col);
- private void calculate_matrix (int num_rows,int num_cols);
- private void backtrack_matrix (int num_rows,int num_cols);
- /****************************************************************/
- public int
- main(int argc,char **argv)
- { say_hello();
- while(get_strings())
- { initialize_matrix();
- calculate_matrix(strlen(A),strlen(B));
- backtrack_matrix(strlen(A),strlen(B));
- }
- return 0;
- }
- /****************************************************************/
- private void
- say_hello(void)
- { if(verbose_mode) printf("\nLevenshtein distance, V1.0");
- }
- /****************************************************************/
- private bool
- get_strings(void)
- { char buffer[SIZEOF_STRING*3]; //arbitrarily big buffer
-
- printf("\nEnter first string > "); gets(buffer);
- if(buffer[0]=='\0') return FALSE;
- strcpy(A,buffer);
- printf("\nEnter second string > "); gets(buffer);
- if(buffer[0]=='\0') return FALSE;
- strcpy(B,buffer);
- return TRUE;
- }
- /****************************************************************/
- private void
- initialize_matrix(void)
- { int row,col;
-
- for(row=0,col=0; col<NUM_COLS; col++) // initialize the first row
- {
- M [row][col].distance = col;
- M [row][col].op = INS;
- }
- for(row=0,col=0; row<NUM_ROWS; row++) // initialize the first column
- {
- M [row][col].distance = row;
- M [row][col].op = DEL;
- }
- }
- /****************************************************************/
- private void
- calculate_cell(int row,int col)
-
- { int dNorthWest = DIST(row-1, col-1);
- int dWest = DIST(row , col-1);
- int dNorth = DIST(row-1, col );
-
- if(dWest < dNorth)
- { if(dWest < dNorthWest)
- { M [row][col].op = INS;
- M [row][col].distance = dWest + COST(INS);
- }
- else // dNorthWest <= dWest < dNorth
- { opcode op;
- op = ( A[row-1]==B[col-1] ) ? MATCH : SUB;
- M [row][col].op = op;
- M [row][col].distance = dNorthWest + COST(op);
- }
- }
- else // dNorth <= dWest
- { if(dNorth < dNorthWest)
- { M [row][col].op = DEL;
- M [row][col].distance = dNorth + COST(DEL);
- }
- else // dNorthWest <= dNorth <= dWest
- { opcode op;
- op = ( A[row-1]==B[col-1] ) ? MATCH : SUB;
- M [row][col].op = op;
- M [row][col].distance = dNorthWest + COST(op);
- }
- }
- }
- /****************************************************************/
- private void
- calculate_matrix(int num_rows,int num_cols)
- { int row,col;
-
- if(verbose_mode)
- { printf("\n\\\n \\COL |");
- for(col=0; col < num_cols+1; col++)
- printf("____%d____|");
- printf("\nROW \\ | |");
- for(col=1; col < num_cols+1; col++)
- printf(" %c |", B [col-1]);
- printf("\n 0 \\|");
- for(row=0,col=0; col < num_cols+1; col++)
- print_cell(row,col);
- }
- for(row=1; row<num_rows+1; row++)
- { if(verbose_mode) printf("\n% 2d %c |",row, A [row-1]);
- print_cell(row,0);
- for(col=1; col<num_cols+1; col++)
- {
- calculate_cell(row,col);
- if(verbose_mode) print_cell(row,col);
- }
- }
- }
-
- /****************************************************************/
- private void
- print_cell(int row,int col)
- { printf(" %d %s ",DIST(row,col),OPSTR( M [row][col].op));
- }
- /****************************************************************/
- private void
- backtrack_matrix(int num_rows,int num_cols)
- { int dx,dy;
- int i,j;
- int row = num_rows;
- int col = num_cols;
-
- printf("\n\nLevenshtein distance is %d.\n",DIST(row,col));
- printf("\nBacktrace: %s",A);
-
- while(row>0 || col>0)
- { if( ( M [row][col].op != MATCH) && verbose_mode)
- { printf("\nD(%d,%d)=%d ", row,col,DIST(row,col));
- printf("%s --> ",OPSTR( M [row][col].op));
- for(i=1;i<row-(M[row][col].op==DEL?1:0);i++)
- printf("%c", A[i-1]);
- /*printf("_");*/
- for(j=col-(M[row][col].op==INS?1:0);j<num_cols+1;j++)
- printf("%c", B[j-1]);
- }
- dy = optable[(int)( M [row][col].op)].delta_row;
- dx = optable[(int)( M [row][col].op)].delta_col;
- row += dy;
- col += dx;
- }
- }
-
-